#################################################
# EXPLORATORY DATA ANALYSIS of Air Traffic Data #
#################################################
# (and learning Pandas and Plotly libraries)
#__course__ = 'CS131A:Wostner:Fall2016'
#__author__ = 'mvarga2'
#__date__ = '2016-12-18'
# source file: 704969640_T_ONTIME_2016_10.csv
# data dictionary file: 704969640_T_ONTIME_ReadMe.csv
# period: October 2016
# data provider: Bureau of Transportation Statistics
# Table Of Contents:
#
# 1. Getting a sneak peek at the data
# 2. Calculating the measures of center and spread
### A. Analysing SFO outbound flights
### B. Analysing SFO inbound flights
# 3. Graphing the results - offline
# 4. Cleaning up
# 5. Findings and Conclusion
# 1. Getting a sneak peek at the data
# importing pandas library
import pandas as pd
# reading in source data from a csv file into df_airdata Pandas dataframe object
# please, check the location of the source file and specify its path accordingly
df_airdata = pd.read_csv('~/air_traffic_data/704969640_T_ONTIME_2016_10.csv')
#del df_airdata
# reading in a list of airport codes
df_airports=pd.read_csv('~/air_traffic_data/L_AIRPORT_ID.csv')
# gettting to know our data (similar to unix head command)
df_airdata.head()
# getting to know our data (similar to unix tail command)
df_airdata.tail()
# getting the number of columns in our dataframe
len(df_airdata.columns)
# listing the names of all available columns
# df_airdata.columns
df_airdata.columns.values.tolist()
# getting the columns' datatypes
df_airdata.dtypes
# generating summary statistics for each column of the whole dataframe
df_airdata.describe()
# getting total number of rows in our dataframe
len(df_airdata.index)
# printing first 5 rows of the airport codes dataframe - 5 is the default for both head and tail commands
df_airports.head()
# looking up a unique code representing San Francisco International Airport (SFO)
airport_code = (df_airports[df_airports['Description']
.str.contains("Francisco International")]
.iloc[0]['Code']
)
print(airport_code)
# getting the number of records(rows) with only flights terminating at SFO
len(df_airdata.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code].index)
#df_airports[df_airports['Description'].str.contains("Francisco, CA")].iloc[:,[0,1]]
# getting the number of records with only flights originating at SFO
len(df_airdata.loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code].index)
# printing only columns with specified indexes using iloc method
df_airdata.iloc[:,[0,3,5]].head()
# counting number of values in each column. note, that some columns have missing values for some of the rows,
# hence the counts are different for the columns
df_airdata.count()
# reporting the minimum flight date in our dataset (we are working with October data only)
df_airdata['FL_DATE'].min()
# reporting the maximum flight date in our dataset
df_airdata['FL_DATE'].max()
# getting a quick glance at the data related to SFO as a destination airport using "isin" method and a list of values
airport_ids = [airport_code,'']
df_airdata.loc[df_airdata['DEST_AIRPORT_ID'].isin(airport_ids)].head()
# 2. Calculating the measures of center and spread
# A. Analysing SFO outbound flights
# (i.e. flighs originating at SFO)
# finding the mean (average) value of departure delay for SFO outbound flights (in minutes)
(df_airdata
.loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code]['DEP_DELAY']
.mean()
)
# finding the standard deviation value of departure delay for SFO outbound flights (in minutes)
(df_airdata
.loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code]['DEP_DELAY']
.std()
)
# finding the median (middle) value of departure delay for SFO outbound flights (in minutes)
(df_airdata
.loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code]['DEP_DELAY']
.median()
)
# finding the mode (most frequent) value of departure delay for SFO outbound flights (in minutes)
(df_airdata
.loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code]['DEP_DELAY']
.mode()
)
# finding the mean (average) value of arrival delay for SFO outbound flights (in minutes)
(df_airdata
.loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code]['ARR_DELAY']
.mean()
)
# finding the standard deviation value of arrival delay for SFO outbound flights (in minutes)
(df_airdata
.loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code]['ARR_DELAY']
.std()
)
# finding the median (middle) value of arrival delay for SFO outbound flights (in minutes)
(df_airdata
.loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code]['ARR_DELAY']
.median()
)
# finding the mode (most frequent) value of arrival delay for SFO outbound flights (in minutes)
(df_airdata
.loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code]['ARR_DELAY']
.mode()
)
# finding the outliers - maximum departure/arrival delay for SFO outbound flights (in minutes)
(df_airdata
.loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code][['DEP_DELAY','ARR_DELAY']]
.max()
)
# finding the outliers - minimum departure/arrival delay for SFO outbound flights (in minutes)
(df_airdata
.loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code][['DEP_DELAY','ARR_DELAY']]
.min()
)
# viewing first 10 rows sorted by arrival delay descending for SFO outbound flights (in minutes)
(df_airdata
.loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code][['CARRIER','DEP_DELAY','ARR_DELAY']]
.sort_values(by=['ARR_DELAY'], ascending=[0])
.head(10)
)
# viewing last 10 rows sorted by arrival delay for SFO outbound flights (in minutes)
# if negative, the flight departed/arrived earlier than scheduled
(df_airdata.loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code][['CARRIER','DEP_DELAY','ARR_DELAY']]
.sort_values(by=['ARR_DELAY'], ascending=[1])
.head(10)
)
# computing departure and arrival delay averages by airline, sorting the output by arrival delay - descending
df_delays_out_desc = (df_airdata.loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code ][['CARRIER','DEP_DELAY','ARR_DELAY']]
#.groupby(['CARRIER'], as_index = False)
.groupby(['CARRIER'])
.mean()
.reset_index()
.sort_values(by=['ARR_DELAY'], ascending=[0])
.head(5) # 5 is default
)
df_delays_out_desc
# computing departure and arrival delay averages by airline, sorting the output by arrival delay - ascending
# displaying first five airlines with shortest average delays
# if negative, the flights departed/arrived on average earlier than scheduled
df_delays_out_asc = (df_airdata.loc[df_airdata['ORIGIN_AIRPORT_ID'] == airport_code][['CARRIER','DEP_DELAY','ARR_DELAY']]
.groupby(['CARRIER'], as_index = False)
.mean()
.round(2)
.sort_values(by=['ARR_DELAY'], ascending=[1])
.head())
df_delays_out_asc
# B. Analysing SFO inbound flights
# (i.e. flights terminating at SFO)
# finding the mean (average) value of arrival delay for SFO bound flights (in minutes)
(df_airdata
.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code]['ARR_DELAY']
.mean()
)
# finding the standard deviation value of arrival delay for SFO bound flights (in minutes)
(df_airdata
.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code]['ARR_DELAY']
.std()
)
# finding the median value of arrival delay for SFO bound flights (in minutes)
(df_airdata
.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code]['ARR_DELAY']
.median()
)
# finding the mode value of departure delay for SFO outbound flights (in minutes)
(df_airdata
.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code]['ARR_DELAY']
.mode()
)
# finding the mean value of departure delay for SFO bound flights (in minutes)
(df_airdata
.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code]['DEP_DELAY']
.mean()
)
# finding the standard deviation value of departure delay for SFO bound flights (in minutes)
(df_airdata
.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code]['DEP_DELAY']
.std()
)
# finding the median value of departure delay for SFO bound flights (in minutes)
(df_airdata
.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code]['DEP_DELAY']
.median()
)
# finding the mode value of departure delay for SFO bound flights (in minutes)
(df_airdata
.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code]['DEP_DELAY']
.mode()
)
# summing up the average delay of flights headed to SFO (in minutes)
(df_airdata
.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['DEP_DELAY','ARR_DELAY']]
.mean()
.sum()
)
# this number does not make sense, please ignore
# finding the outliers - maximum arrival delay of SFO bound flights (in minutes)
(df_airdata
.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code]['ARR_DELAY']
.max()
)
# finding the outliers - minimum arrival delay of SFO bound flights (in minutes)
(df_airdata
.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code]['ARR_DELAY']
.min()
)
# printing maximum values of both departure and arrival delays for SFO bound flights (in minutes)
(df_airdata
.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['DEP_DELAY','ARR_DELAY']]
.max()
)
# printing mininum values of both departure and arrival delays for SFO bound flights (in minutes)
# if negative, the flight departed/arrived earlier than scheduled
(df_airdata
.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['DEP_DELAY','ARR_DELAY']]
.min()
)
# viewing first 10 rows sorted by arrival delay descending
(df_airdata
.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['CARRIER','DEP_DELAY','ARR_DELAY']]
.sort_values(by=['ARR_DELAY'], ascending=[0])
.head(10)
)
# viewing last 10 rows sorted by arrival delay
# if negative, the flight departed/arrived earlier than scheduled
(df_airdata.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['CARRIER','DEP_DELAY','ARR_DELAY']]
.sort_values(by=['ARR_DELAY'], ascending=[1])
.head(10)
)
# computing departure and arrival delay averages by airline
(df_airdata.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['CARRIER','DEP_DELAY','ARR_DELAY']]
.groupby(['CARRIER'])
.mean()
#.reset_index()
)
# computing departure and arrival delay averages by airline, sorting the output by arrival delay - descending
df_delays_in_desc = (df_airdata.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['CARRIER','DEP_DELAY','ARR_DELAY']]
#.groupby(['CARRIER'], as_index = False)
.groupby(['CARRIER'])
.mean()
.reset_index()
.sort_values(by=['ARR_DELAY'], ascending=[0])
.head(5) # 5 is default
)
df_delays_in_desc
# computing departure and arrival delay averages by airline, sorting the output by arrival delay - descending
# displaying first five airlines with longest average delays
# second option is to use as_index = False clause instead of resetting index
df_delays_in_desc = (df_airdata.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['CARRIER','DEP_DELAY','ARR_DELAY']]
.groupby(['CARRIER'], as_index = False)
.mean()
.sort_values(by=['ARR_DELAY'], ascending=[0])
.head()
)
df_delays_in_desc
# computing departure and arrival delay averages by airline, sorting the output by arrival delay - ascending
# displaying first five airlines with shortest average delays
# if negative, the flights departed/arrived on average earlier than scheduled
df_delays_in_asc = (df_airdata.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['CARRIER','DEP_DELAY','ARR_DELAY']]
.groupby(['CARRIER'], as_index = False)
.mean()
.round(2)
.sort_values(by=['ARR_DELAY'], ascending=[1])
.head())
#testdf['CARRIER','DEP_DELAY']
df_delays_in_asc
# finding duration of arrival delay causes in minutes
(df_airdata.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][df_airdata['SECURITY_DELAY'] >= 0.0][[ 'ARR_DELAY','CARRIER_DELAY','WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']]
.sum()
#.reset_index()
)
# assigning sums of arrival delay causes to variables
arr_del,car_del,wea_del,nas_del,sec_del,late_del = (df_airdata.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][df_airdata['SECURITY_DELAY'] >= 0.0][[ 'ARR_DELAY','CARRIER_DELAY','WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']]
.sum()
)
# alternatively we can create a list with columns values as its elements using .tolist() method on our df_airdata dataframe
arr_del_sums = (df_airdata.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][df_airdata['SECURITY_DELAY'] >= 0.0][['ARR_DELAY','CARRIER_DELAY','WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']]
.sum()
.tolist()
)
arr_del_sums
# confirming the sum of all sums of specific delays matches that of overall arrival delay
(df_airdata
.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][df_airdata['SECURITY_DELAY'] >= 0.0][[ 'CARRIER_DELAY','WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']]
.sum()
.sum()
)
# 3. Graphing the results - offline
import plotly
from plotly.graph_objs import Scatter, Layout, Bar
plotly.offline.init_notebook_mode()
layout = Layout(
title='Average SFO Arrival Delay by Airline',
xaxis=dict(
title='Airlines',
titlefont=dict(
family='Courier New, monospace',
size=14,
color='#7f7f7f'
)
),
yaxis=dict(
title='Arrival delay (in minutes)',
titlefont=dict(
family='Courier New, monospace',
size=14,
color='#7f7f7f'
)
)
)
data = [Bar(x=df_delays_in_desc['CARRIER'], y=df_delays_in_desc['ARR_DELAY'])]
plotly.offline.iplot({ "data": data, "layout":layout})
#py.iplot(data, filename='basic-bar')
import plotly
from plotly.graph_objs import Scatter, Layout, Bar
plotly.offline.init_notebook_mode()
layout = Layout(
title='Average SFO Departure Delay by Airline',
xaxis=dict(
title='Airlines',
titlefont=dict(
family='Courier New, monospace',
size=14,
color='#7f7f7f'
)
),
yaxis=dict(
title='Departure delay (in minutes)',
titlefont=dict(
family='Courier New, monospace',
size=14,
color='#7f7f7f'
)
)
)
data = [Bar(x=df_delays_out_desc['CARRIER'], y=df_delays_out_desc['DEP_DELAY'])]
plotly.offline.iplot({ "data": data, "layout":layout})
#py.iplot(data, filename='basic-bar')
import plotly.plotly
import plotly.graph_objs as go
plotly.offline.init_notebook_mode()
fig = {
'data': [{'labels': ['Carrier Delay', 'Weather Delay', 'NAS Delay', 'Security Delay', 'Late Aircraft Delay'],
'values': [car_del/arr_del, wea_del/arr_del, nas_del/arr_del, sec_del/arr_del, late_del/arr_del],
'type': 'pie'}],
'layout': {'title': 'Distribution of SFO Arrival Delay Causes - October 2016'}
}
#plotly.offline.iplot({ "data": data, "layout":layout})
plotly.offline.iplot(fig)
# alternatively plotting the same graph using our list of arrival delay sums "arr_del_sums" instead
plotly.offline.init_notebook_mode()
mylist=list()
for index,element in enumerate(arr_del_sums):
if not index == 0:
#print(element/arr_del_sums[0])
mylist.append(element/arr_del_sums[0])
fig = {
'data': [{'labels': ['Carrier Delay', 'Weather Delay', 'NAS Delay', 'Security Delay', 'Late Aircraft Delay'],
# 'values': [arr_del_sums[1]/arr_del_sums[0], arr_del_sums[2]/arr_del_sums[0], arr_del_sums[3]/arr_del_sums[0], arr_del_sums[4]/arr_del_sums[0], arr_del_sums[5]/arr_del_sums[0]],
'values': mylist,
'type': 'pie'}],
'layout': {'title': 'Distribution of SFO Arrival Delay Causes - October 2016'}
}
#plotly.offline.iplot({ "data": data, "layout":layout})
plotly.offline.iplot(fig)
# creating a subset of data depicting airlines terminating at SFO ordered by average arrival delay descending
# alongside with number of inbound flights
df_airdata_sub = (df_airdata.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['CARRIER','ARR_DELAY']]
.groupby(['CARRIER'], as_index = True)
.agg(['mean','count'])
.sort_values(by=[('ARR_DELAY','mean')], ascending=[0])
)
df_airdata_sub.columns = df_airdata_sub.columns.droplevel(0)
df_airdata_sub = df_airdata_sub.reset_index()
df_airdata_sub
# creating a subset of 10 flights with longest delays alonside with carrier name and day of month of each flight
df_top_delay = (df_airdata
#.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['CARRIER','DEP_DELAY','ARR_DELAY','DAY_OF_MONTH']]
.loc[df_airdata['DEST_AIRPORT_ID'] == airport_code][['CARRIER','ARR_DELAY','DAY_OF_MONTH']]
.sort_values(by=['ARR_DELAY'], ascending=[0])
.head(10)
)
df_top_delay
# merging these two subsets together by CARRIER name.
df_airdata_sub_out = df_airdata_sub.merge(df_top_delay, on='CARRIER').sort_values(by=['ARR_DELAY'], ascending=[0])
df_airdata_sub_out
# printing a bubble chart
import plotly
from plotly.graph_objs import Scatter, Layout
plotly.offline.init_notebook_mode()
data = [
{
'x': df_airdata_sub_out['DAY_OF_MONTH'],
'y': df_airdata_sub_out['ARR_DELAY'],
'text': df_airdata_sub_out['CARRIER'],
'mode': 'markers',
'marker': {
'color': [120, 125, 130, 135, 140, 145, 150, 155, 160, 165 ],
#'size': [15, 30, 55, 70, 90, 110,5, 30, 55, 70, 90],
'size': df_airdata_sub_out['mean'] ,
'sizeref':0.25 #,
# 'showscale': True
}
}
]
layout = Layout(
title=' Ten Longest Arrival Delays (in minutes)',
xaxis=dict(
title='Day Of Month',
titlefont=dict(
family='Courier New, monospace',
size=14,
color='#7f7f7f'
)
),
yaxis=dict(
title='Arrival delay (in minutes)',
titlefont=dict(
family='Courier New, monospace',
size=14,
color='#7f7f7f'
)
)
)
plotly.offline.iplot({ "data": data, "layout":layout})
# 4. Cleaning up
del df_airdata
del df_delays_in_desc
del df_delays_in_asc
del df_delays_out_desc
del df_delays_out_asc
del df_airdata_sub
del df_top_delay
df_airdata_sub_out
# 5. Findings and Conclusion
# We have analyzed air traffic dataset for October 2016 provided by Bureau of Transportation Statistics
# available @http://www.transtats.bts.gov. We have only dealt with a subset of the data related to San Francisco
# International Airport(SFO).
#
# Here are some of the findigs:
#
# 1. On average a flight originating from SFO in October 2016 had a delay of 15 minutes but
# on average it arrived with only a 7.5 minute delay to its destination.
#
# 2. The airline departing from SFO with the highest average arrival delay of 18 minutes is Frontier Airlines
# closely followed by Southwest Airlines. On the other hand Delta Airlines has the lowest arrival delay average
# in October 2016. In fact, on average it arrives ahead of time. American Airlines finished 2nd in terms of flight
# accuracy.
#
# 3. Flights terminating at SFO have a delay of 19 minutes on average.
#
# 4. The top average arrival delay of 29 minutes has been recorded again for Frontier Airlines.
#
# 5. The lowest average arrival delay of 1.26 minutes for flights terminating at SFO was accomplished by Hawaiian Airlines,
# Delta placed 2nd.
#
# 6. The NAS(National Airspace System) can be blamed for the majority of SFO arrival delays, on average 62 percent of delays
# can be attributed to NAS
#
# Please, bear in mind that these findings may not be significant over longer period of time